---
title: "manosphere_code_nov2024"
output: html_notebook
---
```{r}
library(tidyverse)
library(extrafont)
library(here)

# Register system fonts with extrafont so the figures can use Garamond.
# font_import() rescans every installed font and, by default, waits for a
# y/n confirmation, so it is only run when Garamond is not yet in the
# extrafont font table, and without the prompt so it also works when knitting.
if (!"Garamond" %in% fonts()) {
  font_import(prompt = FALSE)
}

# The "win" device is only available on Windows; elsewhere fall back to the
# default loadfonts() device so the notebook still runs.
if (.Platform$OS.type == "windows") {
  loadfonts(device = "win")
} else {
  loadfonts()
}

# Show the project root directory detected by the here package
here()

# Load the master dataset and the intercoder dataset from their CSV files
# (paths are relative to the current working directory)
manosphere_data_master <- read.csv(
  file = "manosphere_data_master_anonymized.csv"
)
manosphere_intercoder_data <- read.csv(
  file = "manosphere_intercoder_data_anonymized.csv"
)

# Number of distinct organizations (websites) in the master dataset
unique_name_count <- manosphere_data_master %>%
  pull(organization) %>%
  n_distinct()
print(unique_name_count)
```

```{r}
# Number of distinct websites that are currently available
# (rows where available_now is "Y")
unique_name_count_available <- manosphere_data_master %>%
  filter(available_now == "Y") %>%
  distinct(organization) %>%
  summarise(unique_names = n())

# Show the raw count, then express it as a percentage of 141 sites.
# NOTE(review): 141 is hard-coded; presumably the total number of sites in
# the sample -- confirm it matches unique_name_count.
print(unique_name_count_available)
unique_name_count_available / 141 * 100
```

```{r}
# Number of distinct websites whose availability is "limited"
# (i.e. posts obviously deleted, limited functionality, etc.; coded "L")
unique_name_count_limited <- manosphere_data_master %>%
  filter(available_now == "L") %>%
  distinct(organization) %>%
  summarise(unique_names = n())

# Show the raw count, then express it as a percentage of 141 sites.
# NOTE(review): 141 is hard-coded; presumably the total number of sites in
# the sample -- confirm it matches unique_name_count.
print(unique_name_count_limited)
unique_name_count_limited / 141 * 100
```

```{r}
# Number of distinct websites coded "N", "D" or "L" in available_now
# (not available, limited availability, or redirected)
coded_NDL_count <- manosphere_data_master %>%
  filter(available_now %in% c("N", "D", "L")) %>%
  distinct(organization) %>%
  summarise(unique_names = n())

print(coded_NDL_count)
```

```{r}
# Websites with complete Wayback Machine availability: every row of the
# organization has available_wayback == "Y"
unique_name_count_all_years <- manosphere_data_master %>%
  group_by(organization) %>%
  summarise(
    n_rows = n(),
    n_archived = sum(available_wayback == "Y")
  ) %>%
  # Keep organizations whose archived-row count equals their row count
  filter(n_archived == n_rows) %>%
  summarise(unique_names = n())

print(unique_name_count_all_years)
```

```{r}
# Websites with partial Wayback Machine availability: at least one row, but
# not every row, of the organization has available_wayback == "Y"
partial_availability_count <- manosphere_data_master %>%
  group_by(organization) %>%
  summarise(
    n_rows = n(),
    n_archived = sum(available_wayback == "Y")
  ) %>%
  # Both conditions must hold (comma-separated filter terms are ANDed)
  filter(n_archived > 0, n_archived < n_rows) %>%
  summarise(unique_names = n())

print(partial_availability_count)
```

```{r}
# Websites with no Wayback Machine availability: every row of the
# organization has available_wayback == "N"
not_available_count <- manosphere_data_master %>%
  group_by(organization) %>%
  summarise(never_archived = all(available_wayback == "N")) %>%
  filter(never_archived) %>%
  summarise(unique_names = n())

print(not_available_count)
```

```{r}
# Calculate intercoder reliability for post counts: share of
# organization-year pairs where coders agree exactly, and share where their
# counts differ by at most 10 posts

agreement_results_posts <- manosphere_intercoder_data %>%
  # Coerce posts to numeric; entries that are not plain numbers become NA
  # (as.numeric() does not strip non-numeric characters)
  mutate(posts = as.numeric(posts)) %>%
  # Group by organization and year to get matching coder entries
  group_by(organization, year) %>%
  # Only keep groups with at least 2 raters
  filter(n() >= 2) %>%
  summarize(
    # A group with any missing count cannot be judged, so both metrics are NA
    # for it. This keeps the two percentages on the same denominator and
    # avoids counting "one coder missing" as exact agreement.
    exact_agreement = if (anyNA(posts)) NA else n_distinct(posts) == 1,
    max_diff = if (anyNA(posts)) NA_real_ else max(posts) - min(posts),
    .groups = "drop"
  ) %>%
  # Groups with NA metrics are dropped from both means via na.rm = TRUE
  summarize(
    exact_agreement_pct = mean(exact_agreement, na.rm = TRUE) * 100,
    agreement_pct = mean(max_diff <= 10, na.rm = TRUE) * 100
  )

# Print results
cat("\nPosts Agreement Analysis:\n")
cat("Percentage of exact agreement:", round(agreement_results_posts$exact_agreement_pct, 1), "%\n")
cat("Percentage of agreement (≤10 posts difference):", round(agreement_results_posts$agreement_pct, 1), "%\n")
```

```{r}
# Calculate intercoder reliability for manosphere type: share of
# organization-year pairs where all coders assigned the same type_condensed

agreement_results_type <- manosphere_intercoder_data %>%
  # Group by organization and year to get matching coder entries
  group_by(organization, year) %>%
  # Only keep groups with at least 2 raters
  filter(n() >= 2) %>%
  summarize(
    # NA is compared as a value of its own (n_distinct() counts NA unless
    # na.rm = TRUE): NA vs. a real type is a disagreement, and NA from every
    # coder is an agreement. Dropping NAs would score "one coder NA" as
    # agreement and "all coders NA" as disagreement.
    # NOTE(review): assumes NA means "Unclassified" here, as in the figure
    # legends -- confirm for the intercoder file.
    type_agreement = n_distinct(type_condensed) == 1,
    .groups = "drop"
  ) %>%
  summarize(
    agreement_pct = mean(type_agreement, na.rm = TRUE) * 100
  )

# Print results
cat("\nType Agreement Analysis:\n")
cat("Percentage of agreement:", round(agreement_results_type$agreement_pct, 1), "%\n")

```

```{r}
# Figure 1: number of new manosphere websites per year, i.e. a histogram of
# each organization's earliest year in the data
# (Looksmax.org and the year 2021 are excluded)

figure1 <- manosphere_data_master %>%
  filter(organization != "Looksmax.org", year != 2021) %>%
  group_by(organization) %>%
  # with_ties = FALSE guarantees exactly one row per organization, so a site
  # with duplicate rows for its first year is not counted more than once
  slice_min(year, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  ggplot(aes(x = year)) +
  geom_histogram(
    binwidth = 1,
    color = "white",
    fill = "#00BFC4"
  ) +
  scale_x_continuous(n.breaks = 11) +
  scale_y_continuous(n.breaks = 10) +
  theme_bw() +
  labs(
    x = "Year",
    y = "Number of Sites"
  ) +
  # No colour aesthetic is mapped, so no colour scale is needed
  theme(text = element_text(size = 18, family = "Garamond"))

figure1
```

```{r}
# Figure 2: stacked bar chart of how the sample is distributed across
# manosphere subgroups in each year
# (Looksmax.org and the year 2021 are excluded)

figure2 <- ggplot(
  data = manosphere_data_master %>%
    filter(organization != "Looksmax.org", year != 2021) %>%
    group_by(year),
  mapping = aes(x = year, fill = type_condensed)
) +
  # position = "fill" rescales every bar to the same height (proportions)
  geom_bar(position = "fill") +
  scale_x_continuous(n.breaks = 12) +
  scale_y_continuous(n.breaks = 10) +
  # Missing subgroup values are shown in the legend as "Unclassified"
  scale_fill_discrete(
    limits = c("CMR", "Incel", "MGTOW", "MRA", "PUA", NA),
    labels = c("CMR", "Incel", "MGTOW", "MRA", "PUA", "Unclassified")
  ) +
  labs(
    x = "Year",
    y = "Percentage of Sample",
    fill = "Subgroup"
  ) +
  theme_bw() +
  theme(text = element_text(size = 18, family = "Garamond"))

figure2

```

```{r}
# Figure 3: line graph of the total number of posts per subgroup and year
# (Looksmax.org and the year 2021 are excluded)

# Coerce posts to numeric; entries that are not plain numbers become NA
manosphere_data_master$posts <- as.numeric(manosphere_data_master$posts)

figure3 <- manosphere_data_master %>%
  filter(organization != "Looksmax.org", year != 2021) %>%
  group_by(year, type_condensed) %>%
  # Missing post counts are ignored in the totals; .groups = "drop" returns
  # an ungrouped result and avoids the regrouping message
  summarise(total_posts = sum(posts, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = year, y = total_posts, colour = type_condensed)) +
  geom_line() +
  scale_x_continuous(n.breaks = 12) +
  scale_y_continuous(n.breaks = 10) +
  theme_bw() +
  labs(x = "Year", y = "Number of Posts", color = "Subgroup") +
  theme(text = element_text(size = 18, family = "Garamond")) +
  # Missing subgroup values are shown in the legend as "Unclassified"
  scale_color_discrete(
    limits = c("CMR", "Incel", "MGTOW", "MRA", "PUA", NA),
    labels = c("CMR", "Incel", "MGTOW", "MRA", "PUA", "Unclassified")
  )

figure3
```

```{r}
# Figure 4 (Appendix): bar graph of the types of social media used by the
# manosphere across time

# Named platform columns; other_sm is handled separately below
sm_named_platforms <- c(
  "twitter", "facebook", "parler", "gab", "instagram", "youtube"
)

# Replace NA values with the string "NA" in the named platform columns
# (this keeps the columns as character so they can be pivoted together)
manosphere_data_master <- manosphere_data_master %>%
  mutate(across(all_of(sm_named_platforms), ~ replace(.x, is.na(.x), "NA")))

# Long format: one row per site-year and platform, limited to site-years
# that report social media use
pivot <- manosphere_data_master %>%
  filter(social_media == "Y") %>%
  pivot_longer(
    cols = all_of(c(sm_named_platforms, "other_sm")),
    names_to = "sm_type",
    values_to = "count"
  ) %>%
  mutate(
    # Any recorded value becomes "1"; missing values (real NA or the "NA"
    # placeholder) become "NA"
    count = if_else(!is.na(count) & count != "NA", "1", "NA"),
    # Apply historical constraints based on platform launch years
    count = case_when(
      sm_type == "facebook" & year < 2004 ~ "NA",
      sm_type == "youtube" & year < 2005 ~ "NA",
      sm_type == "twitter" & year < 2006 ~ "NA",
      sm_type == "instagram" & year < 2010 ~ "NA",
      sm_type == "gab" & year < 2016 ~ "NA",
      sm_type == "parler" & year < 2018 ~ "NA",
      TRUE ~ count
    )
  )

# Create visualization: one dodged bar per platform and year, counting the
# rows where the platform is recorded as used (2021 is excluded)
figure4 <- pivot %>%
  filter(count != "NA", year != 2021) %>%
  ggplot(aes(
    x = year,
    fill = factor(
      sm_type,
      levels = c(
        "facebook", "gab", "instagram", "parler", "twitter", "youtube",
        "other_sm"
      )
    )
  )) +
  geom_bar(position = position_dodge()) +
  scale_x_continuous(n.breaks = 9) +
  theme_bw() +
  labs(
    x = "Year",
    y = "Social Media Utilized",
    fill = "Type of Social Media"
  ) +
  scale_fill_discrete(
    breaks = c(
      "facebook", "gab", "instagram", "parler", "twitter", "youtube",
      "other_sm"
    ),
    labels = c(
      "Facebook", "Gab", "Instagram", "Parler", "Twitter", "Youtube",
      "Other Social Media"
    )
  ) +
  theme(text = element_text(size = 18, family = "Garamond"))

figure4

```

```{r}
# Political action totals and per-subgroup shares (Table 1, Appendix)

# All activities: overall total, then each subgroup's sum and its
# percentage of that total (missing counts are ignored)
total_pa_sum <- manosphere_data_master %>%
  pull(activities_count) %>%
  sum(na.rm = TRUE)

grouped_sum <- manosphere_data_master %>%
  group_by(type_condensed) %>%
  summarise(sum_values = sum(activities_count, na.rm = TRUE)) %>%
  mutate(percentage = sum_values / total_pa_sum * 100)

# Same calculation based on the activities_count_nobooks column
total_pa_sum_nobooks <- manosphere_data_master %>%
  pull(activities_count_nobooks) %>%
  sum(na.rm = TRUE)

grouped_sum_nobooks <- manosphere_data_master %>%
  group_by(type_condensed) %>%
  summarise(
    sum_values_nobooks = sum(activities_count_nobooks, na.rm = TRUE)
  ) %>%
  mutate(
    percentage_nobooks = sum_values_nobooks / total_pa_sum_nobooks * 100
  )

```

